import time, math, csv, datetime
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import lightgbm as lgb
import numpy as np
import sklearn
from pyspark.sql import SparkSession
def agefromdob(df, reference_date=None):
    """Add an ``age`` column to ``df`` derived from its ``dob`` column.

    The age is the calendar year of the reference date minus the calendar
    year of each date of birth. Month and day are ignored, so the value can
    be one higher than the number of completed years.

    Args:
        df: DataFrame with a ``dob`` column whose values are accepted by
            ``pd.to_datetime``. It is modified in place.
        reference_date: Date the ages are measured against, in any form
            accepted by ``pd.to_datetime``. Defaults to today, which makes
            the result depend on when the code runs; pass a fixed date to
            get reproducible values.

    Returns:
        None. The result is written to ``df['age']``.
    """
    if reference_date is None:
        reference_date = 'today'
    reference_year = pd.to_datetime(reference_date).year
    # Parse the whole column in one call instead of once per row.
    df['age'] = reference_year - pd.to_datetime(df['dob']).dt.year
# Read the training and test CSV files; row 0 of each file supplies the column names.
df_train, df_test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('fraudtrain.csv', 'fraudtest.csv')
)
# Bare expression: in a notebook this renders the training frame as the cell output.
df_train
| Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2019-01-01 00:00:18 | 2703186189652095 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
| 1 | 1 | 2019-01-01 00:00:44 | 630423337322 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
| 2 | 2 | 2019-01-01 00:00:51 | 38859492057661 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
| 3 | 3 | 2019-01-01 00:01:16 | 3534093764340240 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
| 4 | 4 | 2019-01-01 00:03:06 | 375534208663984 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1296670 | 1296670 | 2020-06-21 12:12:08 | 30263540414123 | fraud_Reichel Inc | entertainment | 15.56 | Erik | Patterson | M | 162 Jessica Row Apt. 072 | ... | 37.7175 | -112.4777 | 258 | Geoscientist | 1961-11-24 | 440b587732da4dc1a6395aba5fb41669 | 1371816728 | 36.841266 | -111.690765 | 0 |
| 1296671 | 1296671 | 2020-06-21 12:12:19 | 6011149206456997 | fraud_Abernathy and Sons | food_dining | 51.70 | Jeffrey | White | M | 8617 Holmes Terrace Suite 651 | ... | 39.2667 | -77.5101 | 100 | Production assistant, television | 1979-12-11 | 278000d2e0d2277d1de2f890067dcc0a | 1371816739 | 38.906881 | -78.246528 | 0 |
| 1296672 | 1296672 | 2020-06-21 12:12:32 | 3514865930894695 | fraud_Stiedemann Ltd | food_dining | 105.93 | Christopher | Castaneda | M | 1632 Cohen Drive Suite 639 | ... | 32.9396 | -105.8189 | 899 | Naval architect | 1967-08-30 | 483f52fe67fabef353d552c1e662974c | 1371816752 | 33.619513 | -105.130529 | 0 |
| 1296673 | 1296673 | 2020-06-21 12:13:36 | 2720012583106919 | fraud_Reinger, Weissnat and Strosin | food_dining | 74.90 | Joseph | Murray | M | 42933 Ryan Underpass | ... | 43.3526 | -102.5411 | 1126 | Volunteer coordinator | 1980-08-18 | d667cdcbadaaed3da3f4020e83591c83 | 1371816816 | 42.788940 | -103.241160 | 0 |
| 1296674 | 1296674 | 2020-06-21 12:13:37 | 4292902571056973207 | fraud_Langosh, Wintheiser and Hyatt | food_dining | 4.30 | Jeffrey | Smith | M | 135 Joseph Mountains | ... | 45.8433 | -113.8748 | 218 | Therapist, horticultural | 1995-08-16 | 8f7c8e4ab7f25875d753b422917c98c9 | 1371816817 | 46.565983 | -114.186110 | 0 |
1296675 rows × 23 columns
# Columns that are converted to pandas' categorical type below.
categoricalfeatures = ['city', 'merchant', 'job', 'category', 'state', 'first', 'last', 'zip']

# Encode categorical features.
# Each frame is converted on its own, so the categories of a column are
# taken from the values present in that frame.
for frame in (df_test, df_train):
    for column in categoricalfeatures:
        frame[column] = pd.Categorical(frame[column])
# Derive an 'age' column from the date of birth in both frames.
# Caveat: agefromdob is called without a fixed date, so the resulting ages
# depend on the day this cell is executed.
for frame in (df_train, df_test):
    agefromdob(frame)
# Count of each label value in the training data (shown as the cell output).
df_train['is_fraud'].value_counts()
0 1289169 1 7506 Name: is_fraud, dtype: int64
# Define features: the columns of the frames that are passed to the model.
features = [
    'lat', 'long', 'amt', 'city_pop', 'merch_lat', 'merch_long',
    'city', 'merchant', 'job', 'category', 'age', 'zip', 'unix_time',
]

# Create datasets.
# The evaluation dataset names the training dataset as its reference, as the
# LightGBM documentation asks for validation data.
train_data = lgb.Dataset(df_train[features], df_train['is_fraud'], feature_name='auto')
test_data = lgb.Dataset(
    df_test[features],
    df_test['is_fraud'],
    feature_name='auto',
    reference=train_data,
)

# Define parameters.
# The number of boosting rounds is passed to lgb.train() as num_boost_round
# rather than as 'num_iterations' here; the params key makes LightGBM emit
# "Found `num_iterations` in params. Will use it instead of argument".
params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.03,
    'num_leaves': 300,
    'boosting': 'dart',
    'num_threads': 8,
    'max_bin': 100,
}
# Filled by the record_evaluation callback with the metric history of each
# dataset in valid_sets.
evals = {}

# Define model: train for 200 rounds, evaluating on both datasets.
model = lgb.train(
    params,
    train_data,
    num_boost_round=200,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'test'],
    callbacks=[lgb.record_evaluation(evals)],
)
/Users/Lynden/opt/anaconda3/envs/jupyter/lib/python3.10/site-packages/lightgbm/engine.py:177: UserWarning: Found `num_iterations` in params. Will use it instead of argument
_log_warning(f"Found `{alias}` in params. Will use it instead of argument")
[LightGBM] [Info] Number of positive: 7506, number of negative: 1289169 [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013362 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 3599 [LightGBM] [Info] Number of data points in the train set: 1296675, number of used features: 13 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005789 -> initscore=-5.146050 [LightGBM] [Info] Start training from score -5.146050
# Metrics: plot the evaluation history that was recorded into `evals` during training.
lgb.plot_metric(booster=evals)
<AxesSubplot: title={'center': 'Metric during training'}, xlabel='Iterations', ylabel='auc'>
# Get predictions on the test dataset.
pred = model.predict(df_test[features])
# Round each prediction to the nearest integer to obtain a label.
pred_label = np.round(pred)

# Calculate accuracy: the share of rows whose rounded prediction equals the true label.
n_correct = (pred_label == df_test['is_fraud']).sum()
n_rows = len(pred_label)
print("Accuracy: " + str(n_correct / n_rows))
Accuracy: 0.9965450164561586
# Feature importance: plot the importance of each feature of the trained model.
lgb.plot_importance(booster=model)
# Confusion matrix of the true labels against the rounded predictions.
# Import the submodule explicitly so that `sklearn.metrics` is available
# regardless of which submodules other packages happen to have loaded;
# a bare `import sklearn` does not guarantee it on every version.
import sklearn.metrics

confusion = sklearn.metrics.confusion_matrix(df_test['is_fraud'], pred_label)
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion)
disp.plot()
# F1 score of the rounded predictions against the true labels.
# Import the submodule explicitly so that `sklearn.metrics` is available
# regardless of which submodules other packages happen to have loaded;
# a bare `import sklearn` does not guarantee it on every version.
import sklearn.metrics

f1score = sklearn.metrics.f1_score(df_test['is_fraud'], pred_label)
print("f1: " + str(f1score))
f1: 0.4816414686825054